Mango Masterpiece

Author

Aaron Toth

Published

March 10, 2024

## trying the r package!
library(vegabrite)
library(tidyverse)
Warning: package 'purrr' was built under R version 4.3.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

The data for this visualization come from FAOSTAT, published by the Food and Agriculture Organization of the United Nations (FAO).

## Load the exported CSV from the project's data/ folder
mango_data <- read_csv("data/UNdata_Export_20240311_114817217.csv")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 32562 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Country or Area, Element, Unit, Value Footnotes
dbl (2): Year, Value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Reshape the long export into wide form: one row per area and year,
## one column per distinct value of `Element`
mango_data_wide <- mango_data %>%
  select(`Country or Area`, Element, Year, Value) %>%
  drop_na() %>%
  pivot_wider(names_from = Element, values_from = Value) %>%
  rename(
    production_index_per_capita =
      `Gross per capita Production Index Number (2014-2016 = 100)`,
    production_index = `Gross Production Index Number (2014-2016 = 100)`
  ) %>%
  ## keep only rows that report a positive harvested area
  filter(`Area harvested` > 0)

## Country-level table: drop the areas listed below (mostly regional and
## economic groupings) so they are not plotted as if they were countries
mango_data_wide_country <- mango_data_wide %>%
  rename(Country = `Country or Area`) %>%
  filter(
    !(Country %in% c(
      "World", "Asia", "Africa", "Americas", "Australia and New Zealand",
      "Central America", "Other non-specified areas", "China, mainland",
      "Land Locked Developing Countries", "Least Developed Countries",
      "Low Income Food Deficit Countries",
      "Net Food Importing Developing Countries", "Northern Africa",
      "Northern America", "Oceania", "Polynesia", "Puerto Rico", "Réunion",
      "Small Island Developing States", "South America",
      "South-eastern Asia", "Southern Africa", "Southern Asia",
      "Western Africa", "Western Asia", "Caribbean", "Eastern Asia",
      "Eastern Africa"
    ))
  )

## Region-level table: only regions where mangoes are grown
mango_data_wide_region <- mango_data_wide %>%
  rename(Region = `Country or Area`) %>%
  filter(
    Region %in% c(
      "Central America", "South America", "South-eastern Asia",
      "Southern Asia", "Western Africa", "Eastern Asia", "Eastern Africa",
      "Southern Africa", "Northern Africa", "Northern America"
    )
  )

## Print the regional table to check the result
mango_data_wide_region
# A tibble: 601 × 7
   Region   Year production_index production_index_per…¹ `Area harvested`  Yield
   <chr>   <dbl>            <dbl>                  <dbl>            <dbl>  <dbl>
 1 Centra…  2021            118.                   111.            231130 113092
 2 Centra…  2020            115.                   110.            229946 111039
 3 Centra…  2019            118.                   112.            234354 110896
 4 Centra…  2018            107.                   104.            229799 103070
 5 Centra…  2017            113.                   110.            230106 108519
 6 Centra…  2016            109.                   108.            225716 106642
 7 Centra…  2015            103.                   103.            221512 102563
 8 Centra…  2014             88.4                   89.6           214381  91235
 9 Centra…  2013             96.1                   98.6           221167  96076
10 Centra…  2012             91.9                   95.7           225842  90012
# ℹ 591 more rows
# ℹ abbreviated name: ¹​production_index_per_capita
# ℹ 1 more variable: Production <dbl>

I have some time-series data here. As Wilke writes:

“When making scatter plots or time series, we are often more interested in the overarching trend of the data than in the specific detail of where each individual data point lies.”

This led me to connect my time-series and scatterplot data, rather than just presenting the points. Wilke is also a fan of “smoothing,” to show the “big picture,” so I will try that as well.

## Mango area chart: Production by Year, filled by Region.
## The x-axis title and labels are hidden here because the bar chart placed
## underneath uses the same Year axis.
mango_area <- vl_chart() %>%
  vl_add_data(mango_data_wide_region) %>%
  vl_mark_area(fillOpacity = 0.7) %>%
  vl_encode_x("Year", type = "ordinal") %>%
  vl_axis_x(title = "", labels = FALSE) %>%
  vl_encode_y("Production", type = "quantitative") %>%
  vl_encode_fill("Region:N") %>%
  vl_encode_tooltip("Region") %>%
  vl_add_properties(
    title = "South Asia Historically Dominates Mango Production | FAOSTAT",
    width = 570,
    height = 300
  )

## Companion bar chart: Area harvested by Year, filled by Region,
## same width as the area chart but much shorter
mango_bar <- vl_chart() %>%
  vl_add_data(mango_data_wide_region) %>%
  vl_mark_bar() %>%
  vl_encode_x("Year", type = "ordinal") %>%
  vl_encode_y("Area harvested", type = "quantitative") %>%
  vl_encode_fill("Region:N") %>%
  vl_encode_tooltip("Region") %>%
  vl_add_properties(width = 570, height = 80)

## Area chart on top, bar chart below
vl_vconcat(mango_area, mango_bar)

I also started messing around with some of the regression features after starting with the visualization. There is definitely a better way to code this…

## DURING vs. AFTER THE GREEN REVOLUTION ##
##
## Both panels use exactly the same design (gray scatter + power-law fit),
## so the two layers are built by helpers and only the year filter differs.

## Scatter layer: Production against Area harvested on log-log axes.
##   data            data frame with Year, Country, `Area harvested`, Production
##   year_filter     Vega expression string, e.g. "datum.Year < 1986"
##   production_max  upper bound of the y-axis; kept identical in both panels
##                   so they can be compared side by side
mango_yield_scatter <- function(data, year_filter, production_max = 1e8) {
  vl_chart() %>%
    vl_add_data(data) %>%
    vl_filter(year_filter) %>%
    vl_mark_point() %>%
    ## encode x
    vl_encode_x("Area harvested", type = "quantitative") %>%
    vl_scale_x(type = "log") %>%
    ## encode y
    vl_encode_y("Production", type = "quantitative") %>%
    vl_scale_y(type = "log", domainMax = production_max) %>%
    ## faint gray points so the regression line stands out
    vl_encode_color(value = "lightgray") %>%
    vl_encode_opacity(value = 0.2) %>%
    vl_encode_tooltip("Country") %>%
    ## removing gridlines
    vl_axis_x(grid = FALSE) %>%
    vl_axis_y(grid = FALSE)
}

## Trend layer: power-law regression of Production on Area harvested,
## fitted to the same filtered rows as the scatter layer.
mango_yield_trend <- function(data, year_filter) {
  vl_chart() %>%
    vl_add_data(data) %>%
    vl_filter(year_filter) %>%
    vl_regression(
      regression = "Production",
      on = "Area harvested",
      method = "pow"
    ) %>%
    vl_encode_x("Area harvested", type = "quantitative") %>%
    vl_encode_y("Production", type = "quantitative") %>%
    vl_mark_line(color = "firebrick")
}

## 1986 is the cut-off year. The previous filters (`< 1986` and `> 1986`)
## silently dropped every 1986 observation from both panels; 1986 is now
## assigned to the "after" panel so no year is lost.
during_filter <- "datum.Year < 1986"
after_filter <- "datum.Year >= 1986"

## DURING THE GREEN REVOLUTION ##
rev_scatterplot <- mango_yield_scatter(mango_data_wide_country, during_filter)
rev_regression <- mango_yield_trend(mango_data_wide_country, during_filter)

green_revolution <- vl_layer(rev_scatterplot, rev_regression) %>%
  vl_add_properties(title = "Before 1986", height = 200, width = 200)

## AFTER THE GREEN REVOLUTION ##
postrev_scatterplot <- mango_yield_scatter(mango_data_wide_country, after_filter)
postrev_regression <- mango_yield_trend(mango_data_wide_country, after_filter)

post_green_revolution <- vl_layer(postrev_scatterplot, postrev_regression) %>%
  vl_add_properties(title = "1986 onward", height = 200, width = 200)

## Show the two periods side by side
vl_hconcat(green_revolution, post_green_revolution)